### Analysis for individual-level portion of prison-voting project 
## Replication package for Maine/Vermont prison voting paper, September 2020
## Email Ariel White with questions: arwhi@mit.edu

rm(list=ls())
library(data.table)
setwd("/home/ariel/Dropbox (MIT)/PrisonVoting/replicationdata") #set local filepath
load("VTmergeddata_foranalysis.Rdata") #"finaldat"

finaldata <- finaldat

#################################################################################################################
# calculate turnout rates for 2016, 2018 for people who were incarcerated then (varying assumptions)
# ("in2018"=1 indicates that book date falls before election day 2018)
table(finaldata$in2016); table(finaldata$in2018)

voted2018tight <- sum(finaldata[manualmatch==1 & in2018==1,"voted18"] , na.rm=T) #only count definite matches
voted2018loose <- sum(finaldata[manualmatch>0 & in2018==1,"voted18"], na.rm=T) #include handful of "uncertain" matches
voted2018tight;voted2018loose

turnout18_lower <- voted2018tight/nrow(finaldata[in2018==1])
turnout18_higher <- voted2018loose/nrow(finaldata[in2018==1])
turnout18_lower; turnout18_higher

registered2018tight <- sum(finaldata[manualmatch==1 & in2018==1,"registeredvoter"], na.rm=T) #only count definite matches
registered2018loose <- sum(finaldata[manualmatch>0 & in2018==1,"registeredvoter"], na.rm=T) #include these "uncertain" matches
reg18_lower <- registered2018tight/nrow(finaldata[in2018==1])
reg18_higher <- registered2018loose/nrow(finaldata[in2018==1])
reg18_lower; reg18_higher

#also, what if we just look at raw turnout among people incarcerated as of Feb 2019 (not trying to use book date)
alt18turnout <- sum(finaldata[manualmatch>0, "voted18"], na.rm=T)/ nrow(finaldata)
alt18turnout #about the same. 

#also, what if we drop eight people from the denominator, to account for the approx. number of ineligible noncitizens in the data?
turnout18_higher_dropnoncit <- voted2018loose/(nrow(finaldata[in2018==1])-8)
turnout18_higher_dropnoncit

#also, what if ~10% of the pop were non-residents and ineligible to vote?
turnout18_higher_dropnonres <- voted2018loose/(nrow(finaldata[in2018==1])*.9)

#now, 2016 turnout (though we of course don't have a full census there, we're just looking back at 2016 from 2019)
voted2016tight <- sum(finaldata[manualmatch==1 & in2016==1,"voted16"], na.rm=T) 
voted2016loose <- sum(finaldata[manualmatch>0 & in2016==1,"voted16"], na.rm=T) 
voted2016tight;voted2016loose

turnout16_lower <- voted2016tight/nrow(finaldata[in2016==1])
turnout16_higher <- voted2016loose/nrow(finaldata[in2016==1])
turnout16_lower; turnout16_higher

#again, look at everyone incarcerated
alt16turnout <- sum(finaldata[manualmatch>0, "voted16"], na.rm=T)/ nrow(finaldata)
alt16turnout 


#################################################################################################################
# Take a look at R1 Q3: distribution of reg dates and facilities (was it just one registration drive?)
registered2018loose; voted2018loose

#I'm commenting the below code that examines specific registration dates (not years) because the deidentified dataset only includes reg. year 
#finaldata$registrationdate <- as.Date(finaldata$Date.of.Registration, "%m/%d/%Y")
#summary(finaldata[in2018==1 & registeredvoter==1,]$registrationdate)
#registrationdates_2018registrants <- table(finaldata[in2018==1 & registeredvoter==1,]$registrationdate)
#dim(registrationdates_2018registrants) #282 unique values
#head(sort(registrationdates_2018registrants)); tail(sort(registrationdates_2018registrants))
##how many of these registrants have any sort of date (as opposed to NA)?
#registrantsin2018 <- finaldata[in2018==1 & registeredvoter==1,]
#sum(is.na(registrantsin2018$registrationdate)) #just one NA?
#sum(registrationdates_2018registrants) #yeah, checks out. 

#now look at registration year (Table 2 in SI section 2.3)
registrationyears_2018voters <- as.data.frame(table(finaldata[in2018==1 & manualmatch==1 & voted18==1,]$registrationyear))
colnames(registrationyears_2018voters) <- c("Year","Count")
library(xtable)
print(xtable(registrationyears_2018voters, caption="Registration years for people voting from Vermont prisons in 2018", label="registrationyears"), include.rownames=F, 
      file="registrationyears_2018voters.tex")

#now look at locations (Table 3 in SI section 2.3)
facilities_2018voters <- as.data.frame(table(finaldata[in2018==1 & manualmatch==1 & voted18==1,]$Location))
colnames(facilities_2018voters) <- c("Facility", "County")
#https://doc.vermont.gov/custody-supervision/facilities
print(xtable(facilities_2018voters, caption="Locations of people voting from Vermont prisons in 2018", label="prisonlocations"), include.rownames=F, file="prisonlocations_2018voters.tex")

#################################################################################################################
#also, look at R1's Q about sentences/crime types (for everyone, or comparing voters/not?)
dim(sort(table(finaldata$Crime.1)))
tail(sort(table(finaldata$Crime.1)))
tail(sort(table(finaldata[in2018==1 & voted18==1]$Crime.1))) 
tail(sort(table(finaldata[in2018==1 & voted18==0]$Crime.1)))
tail(sort(table(finaldata[registeredvoter==1]$Crime.1))) 

#calculate some rough semblance of sentence length: how long have people been here, what's their max time to release?
#finaldata[, maxreleasedate := as.Date(Maximum.Release.Date, "%m/%d/%Y %H:%M")]
#finaldata[, maxreleasetime := (maxreleasedate - as.Date("2018-11-06"))/365.25 ] #calculate max remaining sentence in years
#summary(as.numeric(finaldata$maxreleasetime))

#(I've commented out the above code because it now gets run before outputting the de-identified dataset, so I can drop exact release dates from the public data)
#(but the code shows how maxreleasetime was constructed: note that it's in *years*, though the variable says days)
summary(as.numeric(finaldata[in2018==1 & voted18==1]$maxreleasetime))
summary(as.numeric(finaldata[in2018==1 & voted18==0]$maxreleasetime))


#also, per R question: look at demographics of full VT dataset versus registered/voters?
table(finaldata$Race)
#what does this look like for people we think are registered or are sure voted?
table(finaldata[registeredvoter==1]$Race)

table(finaldata[in2016==1 & voted16==1]$Race)
table(finaldata[in2018==1 & voted18==1]$Race)
#idk, these are so few people. 

nonvoters <- finaldata[in2018==1 & (voted18==0 | is.na(voted18)) & !(Race=="") & !(Race=="Unknown")]
voters <- finaldata[in2018==1 & voted18==1 & !(Race=="") & !(Race=="Unknown")]
nonvoters[, white:=0]; nonvoters[Race=="White", white:=1]; 
voters[, white:=0]; voters[Race=="White", white:=1]; 
t.test(nonvoters$white, voters$white)
#yeah, we can't tell these apart.

#make a table: this is Table 4 in SI section 2.4
finaldata[Race=="", Race:="Unknown"]
finaldata$factorrace <- factor(finaldata$Race, ordered=T, levels=c("White", "Black", "AmerIndian", "Hispanic", "Asian", "Unknown"))
voters[Race=="", Race:="Unknown"]
voters$factorrace <- factor(voters$Race, ordered=T, levels=c("White", "Black", "AmerIndian", "Hispanic", "Asian", "Unknown"))
nonvoters[Race=="", Race:="Unknown"]
nonvoters$factorrace <- factor(nonvoters$Race, ordered=T, levels=c("White", "Black", "AmerIndian", "Hispanic", "Asian", "Unknown"))

racetable <- cbind(table(finaldata$factorrace), table(nonvoters$factorrace), table(voters$factorrace)) #stick all three together in columns
racetable <-rbind(racetable, c(nrow(finaldata),nrow(nonvoters), nrow(voters))); rownames(racetable)[7]<- "Total" #add a total row
colnames(racetable) <- c("All Records", "Non-Voters", "Voters")
racetable
print(xtable(racetable, label="racecounts",
caption="Race (According to Prison Records)"), file="voterracetable.tex")

#gender? age? 
summary(finaldata$Age)
summary(finaldata$Gender)
table(nonvoters$Gender); table(voters$Gender)
summary(nonvoters$Age); summary(voters$Age)
t.test(nonvoters$Age, voters$Age)


#another table: this is Table 5 in SI section 2.4
finaldata[Gender=="" | Gender=="Other", Gender:="Other/Unknown"] #I'm collapsing these; I suspect they are actually using "other" to mean unknown, given that prisons are generally not very cool about gender identity
finaldata$factorgender <- factor(finaldata$Gender, ordered=T, levels=c("Male", "Female", "Other/Unknown"))
voters[Gender=="" | Gender=="Other", Gender:="Other/Unknown"]
voters$factorgender <- factor(voters$Gender, ordered=T, levels=c("Male", "Female", "Other/Unknown"))
nonvoters[Gender=="" | Gender=="Other", Gender:="Other/Unknown"]
nonvoters$factorgender <- factor(nonvoters$Gender, ordered=T, levels=c("Male", "Female", "Other/Unknown"))

gendertable <- cbind(table(finaldata$factorgender), table(nonvoters$factorgender), table(voters$factorgender)) #stick all three together in columns
gendertable <-rbind(gendertable, c(nrow(finaldata),nrow(nonvoters), nrow(voters))); rownames(gendertable)[4]<- "Total" #add a total row
colnames(gendertable) <- c("All Records", "Non-Voters", "Voters")
gendertable
print(xtable(gendertable, label="gendercounts",
caption="Gender (According to Prison Records)"), file="votergendertable.tex")

nonvoters[, female:=0]; nonvoters[Gender=="Female", female:=1]; 
voters[, female:=0]; voters[Gender=="Female", female:=1]; 
t.test(nonvoters$female, voters$female)

#overlapping histogram on age: this is Figure 1 in SI section 2.4
h1<-nonvoters$Age
h2<-voters$Age
pdf("voterage.pdf")
hist(h1, col=rgb(1,0,0,0.5), freq=F, main="Age (Non-Voters in Red, Voters in Blue)", xlab="Age") #code from here (modified): https://www.r-bloggers.com/overlapping-histogram-in-r/
hist(h2, col=rgb(0,0,1,0.5), freq=F, add=T)
dev.off()

#and something similar on remaining sentence time?
t.test(nonvoters$maxreleasetime, voters$maxreleasetime)

#####################################################################################################################################################
#and now, the deidentified Maine data:

##read in de-identified dataset of currently-incarcerated people
load("ME_incarc2018elec_deidentified.Rdata") #"maine1deid"
dim(maine1deid)
##simple descriptives: how many registered, how many voted in 2018?
summary(maine1deid$registered)
summary(maine1deid$turnout2018)

table(maine1deid$turnoutmethod) #so we've got some false positives, probably (or else recording errors in vote type)

##same deal now for recently-released people.  
load("ME_1617releases_deidentified.Rdata") #"maine2deid"
summary(maine2deid$registered)
summary(maine2deid$turnout2018)
